!pip install statsmodels --quiet
!pip install opendatasets --upgrade --quiet
!pip install catboost --quiet
!pip install imblearn --quiet
import opendatasets as od
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from keras.models import Sequential
from keras.callbacks import EarlyStopping
from keras.layers import Dense
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import roc_curve, roc_auc_score, precision_recall_curve
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
%matplotlib inline
# Download the Telco churn dataset from Kaggle (skipped when already cached locally).
od.download('https://www.kaggle.com/blastchar/telco-customer-churn')
Skipping, found downloaded files in "./telco-customer-churn" (use force=True to force download)
# Load the CSV into a DataFrame and peek at a few random rows.
filepath = './telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv'
df = pd.read_csv(filepath)
print(df.shape)
df.sample(3)
(7043, 21)
| customerID | gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | ... | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Churn | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 5752 | 7480-SPLEF | Male | 0 | Yes | Yes | 69 | Yes | Yes | No | No internet service | ... | No internet service | No internet service | No internet service | No internet service | Two year | Yes | Bank transfer (automatic) | 26.00 | 1796.55 | No |
| 3085 | 5052-PNLOS | Male | 0 | No | No | 3 | Yes | Yes | Fiber optic | No | ... | Yes | No | Yes | Yes | Month-to-month | Yes | Bank transfer (automatic) | 105.35 | 323.25 | Yes |
| 6337 | 2696-ECXKC | Female | 0 | Yes | Yes | 55 | Yes | Yes | Fiber optic | Yes | ... | No | No | Yes | Yes | One year | No | Mailed check | 100.90 | 5448.6 | No |
3 rows × 21 columns
df.columns
Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
dtype='object')
df.dtypes
customerID object gender object SeniorCitizen int64 Partner object Dependents object tenure int64 PhoneService object MultipleLines object InternetService object OnlineSecurity object OnlineBackup object DeviceProtection object TechSupport object StreamingTV object StreamingMovies object Contract object PaperlessBilling object PaymentMethod object MonthlyCharges float64 TotalCharges object Churn object dtype: object
TotalCharges type is string object. Convert it to float type.
# TotalCharges is stored as strings; inspect the dtype and raw values before converting.
print(df['TotalCharges'].dtype)
print(df.TotalCharges.values)
object ['29.85' '1889.5' '108.15' ... '346.45' '306.6' '6844.5']
There are 11 records with missing values (empty strings) in the TotalCharges column. Since they are only a small proportion (0.15% of the dataset), those records are dropped.
# Coerce TotalCharges to numeric once; entries that cannot be parsed (the
# empty strings) become NaN, so this mask flags the bad records. The original
# code recomputed the same pd.to_numeric(...) coercion twice.
bad_mask = pd.to_numeric(df.TotalCharges, errors='coerce').isnull()
print('No. of records with empty string: ', df[bad_mask].shape[0])
#index of the TotalCharges with missing values
idxlst = list(df[bad_mask].index)
idxlst
No. of records with empty string: 11
[488, 753, 936, 1082, 1340, 3331, 3826, 4380, 5218, 6670, 6754]
# Below results show that the TotalCharges column holds empty strings for
# these rows (note they all have tenure == 0 and Churn == 'No').
df.loc[idxlst,['tenure','MonthlyCharges','TotalCharges','Churn']]
| tenure | MonthlyCharges | TotalCharges | Churn | |
|---|---|---|---|---|
| 488 | 0 | 52.55 | No | |
| 753 | 0 | 20.25 | No | |
| 936 | 0 | 80.85 | No | |
| 1082 | 0 | 25.75 | No | |
| 1340 | 0 | 56.05 | No | |
| 3331 | 0 | 19.85 | No | |
| 3826 | 0 | 25.35 | No | |
| 4380 | 0 | 20.00 | No | |
| 5218 | 0 | 19.70 | No | |
| 6670 | 0 | 73.35 | No | |
| 6754 | 0 | 61.90 | No |
df[df.tenure==0]
| customerID | gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | ... | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Churn | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 488 | 4472-LVYGI | Female | 0 | Yes | Yes | 0 | No | No phone service | DSL | Yes | ... | Yes | Yes | Yes | No | Two year | Yes | Bank transfer (automatic) | 52.55 | No | |
| 753 | 3115-CZMZD | Male | 0 | No | Yes | 0 | Yes | No | No | No internet service | ... | No internet service | No internet service | No internet service | No internet service | Two year | No | Mailed check | 20.25 | No | |
| 936 | 5709-LVOEQ | Female | 0 | Yes | Yes | 0 | Yes | No | DSL | Yes | ... | Yes | No | Yes | Yes | Two year | No | Mailed check | 80.85 | No | |
| 1082 | 4367-NUYAO | Male | 0 | Yes | Yes | 0 | Yes | Yes | No | No internet service | ... | No internet service | No internet service | No internet service | No internet service | Two year | No | Mailed check | 25.75 | No | |
| 1340 | 1371-DWPAZ | Female | 0 | Yes | Yes | 0 | No | No phone service | DSL | Yes | ... | Yes | Yes | Yes | No | Two year | No | Credit card (automatic) | 56.05 | No | |
| 3331 | 7644-OMVMY | Male | 0 | Yes | Yes | 0 | Yes | No | No | No internet service | ... | No internet service | No internet service | No internet service | No internet service | Two year | No | Mailed check | 19.85 | No | |
| 3826 | 3213-VVOLG | Male | 0 | Yes | Yes | 0 | Yes | Yes | No | No internet service | ... | No internet service | No internet service | No internet service | No internet service | Two year | No | Mailed check | 25.35 | No | |
| 4380 | 2520-SGTTA | Female | 0 | Yes | Yes | 0 | Yes | No | No | No internet service | ... | No internet service | No internet service | No internet service | No internet service | Two year | No | Mailed check | 20.00 | No | |
| 5218 | 2923-ARZLG | Male | 0 | Yes | Yes | 0 | Yes | No | No | No internet service | ... | No internet service | No internet service | No internet service | No internet service | One year | Yes | Mailed check | 19.70 | No | |
| 6670 | 4075-WKNIU | Female | 0 | Yes | Yes | 0 | Yes | Yes | DSL | No | ... | Yes | Yes | Yes | No | Two year | No | Mailed check | 73.35 | No | |
| 6754 | 2775-SEFEE | Male | 0 | No | Yes | 0 | Yes | Yes | DSL | Yes | ... | No | Yes | No | No | Two year | Yes | Bank transfer (automatic) | 61.90 | No |
11 rows × 21 columns
df.Churn.value_counts()
No 5174 Yes 1869 Name: Churn, dtype: int64
There are 11 records with missing values for total charges. All missing records are also customers with No churn as well as with Zero tenure. As there are significant samples for No Churn (majority class), these 11 missing records are dropped.
#drop the missing records/rows (the 11 zero-tenure rows with empty TotalCharges)
df1 = df.drop(idxlst,axis=0)
#drop customerID column -- a unique identifier carries no predictive signal
df1 = df1.drop('customerID',axis=1)
df1.shape
(7032, 20)
#convert from string object to float64 (safe now that the empty strings are dropped)
df1 = df1.astype({'TotalCharges':np.float64})
df1.TotalCharges.dtypes
dtype('float64')
Statistics of the numeric features - the dataset contains tenure ranging from 1 to 72 and MonthlyCharges ranging from about 18 to 119.
df1.describe()
| SeniorCitizen | tenure | MonthlyCharges | TotalCharges | |
|---|---|---|---|---|
| count | 7032.000000 | 7032.000000 | 7032.000000 | 7032.000000 |
| mean | 0.162400 | 32.421786 | 64.798208 | 2283.300441 |
| std | 0.368844 | 24.545260 | 30.085974 | 2266.771362 |
| min | 0.000000 | 1.000000 | 18.250000 | 18.800000 |
| 25% | 0.000000 | 9.000000 | 35.587500 | 401.450000 |
| 50% | 0.000000 | 29.000000 | 70.350000 | 1397.475000 |
| 75% | 0.000000 | 55.000000 | 89.862500 | 3794.737500 |
| max | 1.000000 | 72.000000 | 118.750000 | 8684.800000 |
What is the percentage (%) of customers who churn? 26.6% churn, 73.4% not churn.
# Donut chart: overall share of customers by churn status.
churn_share = df1.Churn.value_counts(normalize=True).rename('Percent').reset_index()
churn_share = churn_share.rename(columns={'index': 'Churn'})
t = churn_share
donut = go.Pie(labels=t.Churn.to_list(), values=t.Percent.to_list(), hole=.5)
fig = go.Figure(data=[donut])
# Single layout call covering title, size and legend title.
fig.update_layout(
    title_text="Percent(%) of customers who churn",
    title_x=0.5,
    width=400, height=400,
    titlefont={"size": 14},
    legend_title_text='Churn',
)
fig.show()
Figure below shows that 'Month-to-Month' contract type had the highest churn percentage whereas 'two year' contract type had the lowest churn percentage.
# Percent of all customers in each (Contract, Churn) cell; percentages sum to 100 over the table.
pc_by_contract = df1[['Contract','Churn']].value_counts(normalize=True).rename('Percent').reset_index()
pc_by_contract.Percent = round((pc_by_contract.Percent*100),2)
# cast to str so plotly treats Churn as a discrete colour category
pc_by_contract.Churn = pc_by_contract.Churn.astype(str)
fig = px.bar(
    pc_by_contract.sort_values(by='Contract'),
    x='Contract',
    y='Percent',
    color='Churn',
    barmode='group', # group category rather than stacked bar
    color_discrete_map={ 'Yes': 'red','No': 'green'},
    height=500,
    width = 500,
    title="Percent(%) of churn by contract type",
    text = 'Percent'
)
fig.update_layout(title_x=0.5,uniformtext_minsize=6, titlefont = {'size':16})
fig.update_traces(texttemplate='%{text:.2f}%', textposition='outside')
fig.show()
Figure below shows that customers with low tenure (first 6 months) had higher churn rate and the churn rate decreases as the tenure increases.
# Percent of all customers in each (tenure, Churn) cell, then binned into 6-month intervals.
pc_by_tenure = df1[['tenure','Churn']].value_counts(normalize=True).rename('Percent').reset_index()
pc_by_tenure.Percent = round(pc_by_tenure.Percent *100,2)
# cast to str so plotly treats Churn as a discrete colour category
pc_by_tenure.Churn = pc_by_tenure.Churn.astype(str)
min_tenure = pc_by_tenure.tenure.min()
max_tenure = pc_by_tenure.tenure.max()
fig = px.histogram(
    pc_by_tenure,
    x='tenure',
    y = 'Percent',
    color='Churn',
    barmode='group',
    color_discrete_map={ 'Yes': 'red','No': 'green'},
    height=400,
)
# bucket tenure into 6-month bins
fig.update_traces(xbins=dict(start=min_tenure,end=max_tenure,size=6))
fig.update_layout(
    title = 'Percent(%) of Churn by Tenure (6 months interval)', bargap=0.03, title_x=0.5,
    xaxis_title="Tenure (6 months interval)", yaxis_title="Percent (%)",
    xaxis = dict( tickmode = 'linear', tick0 = min_tenure, dtick = 6),
    titlefont = {'size':14}
)
fig.show()
Below figure shows that Month-to-Month Contract type with low tenure duration has the highest churn percentage. Trendline for month-to-month contract type falls over tenure time. Trendline for one year and two year contract type seems to be flat.
# Churn distribution by tenure, one facet per contract type, with a LOWESS trendline.
pc_con_ten = df1[['Contract','tenure','Churn']].value_counts(normalize = True).rename('Percent').reset_index()
pc_con_ten = pc_con_ten.sort_values(by=['Contract'])
pc_con_ten['Percent'] = round((pc_con_ten['Percent'] * 100), 2)
fig = px.scatter(pc_con_ten,
    x="tenure",
    y="Percent",
    facet_col="Contract",
    color="Churn",
    color_discrete_map={ 'Yes': 'red','No': 'green'},
    trendline="lowess",
    trendline_options=dict(frac=0.1), # level of smoothing
    height=400,
)
# strip the "Contract=" prefix from the facet titles
fig.for_each_annotation(lambda a: a.update(text = a.text.replace('Contract=',"")))
fig.update_layout( title = 'Trendline for Churn distribution by tenure and contract type',
    title_x=0.5, titlefont={'size':16} )
fig.show()
33.6% of customers use Electronic check payment method. But other methods such as Mail check, credit card and bank transfer payment methods are similarly used by customers.
# Share of customers by payment method (donut). `t` is reused by the sunburst below.
t = df1[['PaymentMethod','Churn']].copy()
t = t.value_counts(normalize=True).rename('Percent').reset_index()
t['Percent'] = round((t['Percent']*100),2)
# Assign back explicitly instead of Series.replace(..., inplace=True) on an
# attribute-accessed column: the inplace form operates on a selection
# (chained-assignment hazard) and inplace replace is deprecated in recent pandas.
t['PaymentMethod'] = t['PaymentMethod'].replace(
    ['Credit card (automatic)','Bank transfer (automatic)','Mailed check','Electronic check'],
    ['Credit','Bank Transfer','Mail','Electronic'])
fig = go.Figure(data=[go.Pie(labels=t.PaymentMethod.to_list(), values=t.Percent.to_list(), hole=.5)])
fig.update_layout(
    title_text="Percent(%) of payment method",
    title_x=0.5,
    width=400, height=400,
    titlefont= { "size": 16},
)
fig.show()
Among customers who churn, those using the electronic check payment method have the highest rate (over 15%).
# Sunburst: churn status on the inner ring, payment method on the outer ring.
fig = px.sunburst(
    t,
    path=['Churn', 'PaymentMethod'],
    values='Percent',
    color='Churn',
    color_discrete_map={'Yes': 'red', 'No': 'green'},
    height=450,
    width=450,
)
fig.update_layout(title="Customers by payment methods", title_x=0.5, titlefont={'size': 14})
# Label every wedge with its name and its percent share.
fig.update_traces(texttemplate="<b> %{label}</b> <br> %{value}%")
fig.show()
Customers on a month-to-month contract using the electronic check payment method contribute the highest churn rate.
# Percent of all customers by (PaymentMethod, Contract, Churn), faceted by contract type.
pc_by_pay = df1[['PaymentMethod','Contract','Churn']].value_counts(normalize=True).rename('Percent').reset_index()
pc_by_pay.Percent = round((pc_by_pay.Percent*100),2)
pc_by_pay.sort_values(by='Contract',inplace=True)
fig = px.bar(pc_by_pay,
    x = 'PaymentMethod',
    y = 'Percent',
    text = 'Percent',
    barmode='group',
    facet_col='Contract',
    color = 'Churn',
    color_discrete_map={ 'Yes': 'red','No': 'green'},
    title = '% of Churn by PaymentMethod by Contract type'
)
fig.update_layout(title_x=0.5)
fig.update_traces(texttemplate='%{text:.2f}%', textposition='outside')
# strip the "Contract=" prefix from the facet titles
fig.for_each_annotation(lambda a: a.update(text= a.text.replace('Contract=','')))
fig.show()
Customers with monthly charges range between 68 and 108 have significantly higher churn rate.
# Percent of all customers by (MonthlyCharges, Churn), binned into $10 intervals.
pc_by_mc = df1[['MonthlyCharges','Churn']].value_counts(normalize=True).rename('Percent').reset_index()
pc_by_mc['Percent'] = round(pc_by_mc['Percent']*100,2)
# cast to str so plotly treats Churn as a discrete colour category
pc_by_mc.Churn = pc_by_mc.Churn.astype(str)
min_mc = pc_by_mc.MonthlyCharges.min()
max_mc = pc_by_mc.MonthlyCharges.max()
fig = px.histogram(pc_by_mc,
    x='MonthlyCharges',
    y = 'Percent',
    barmode='group',
    height=400,
    color = 'Churn',
    color_discrete_map={ 'Yes': 'red','No': 'green'}
)
fig.update_traces(xbins=dict(start=min_mc,end=max_mc,size=10))
fig.update_layout(
    title = 'Percent(%) of Churn by MonthlyCharges', bargap=0.03, title_x=0.5,
    xaxis_title="MonthlyCharges ($10 interval)", yaxis_title="Percent (%)",
    titlefont = {'size':16}
)
fig.show()
Transform categorical data to numerical data.
# Work on a copy so df1 keeps the original (un-encoded) values.
telcom_df = df1.copy()
telcom_df.sample(3)
| gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | OnlineBackup | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Churn | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 5842 | Female | 0 | No | No | 13 | Yes | Yes | DSL | Yes | No | Yes | No | Yes | Yes | Month-to-month | Yes | Electronic check | 78.75 | 995.35 | No |
| 5944 | Male | 0 | No | No | 12 | Yes | No | DSL | Yes | No | No | Yes | No | No | Month-to-month | No | Mailed check | 56.65 | 654.85 | Yes |
| 2918 | Male | 1 | Yes | No | 8 | No | No phone service | DSL | No | Yes | Yes | No | No | Yes | Month-to-month | Yes | Mailed check | 43.35 | 371.40 | No |
# unique values of categorical features
def print_unique_cols(df):
for col in df.columns:
if df[col].dtypes=='O':
print(f'{col} = {df[col].unique()}')
print_unique_cols(telcom_df)
gender = ['Female' 'Male'] Partner = ['Yes' 'No'] Dependents = ['No' 'Yes'] PhoneService = ['No' 'Yes'] MultipleLines = ['No phone service' 'No' 'Yes'] InternetService = ['DSL' 'Fiber optic' 'No'] OnlineSecurity = ['No' 'Yes' 'No internet service'] OnlineBackup = ['Yes' 'No' 'No internet service'] DeviceProtection = ['No' 'Yes' 'No internet service'] TechSupport = ['No' 'Yes' 'No internet service'] StreamingTV = ['No' 'Yes' 'No internet service'] StreamingMovies = ['No' 'Yes' 'No internet service'] Contract = ['Month-to-month' 'One year' 'Two year'] PaperlessBilling = ['Yes' 'No'] PaymentMethod = ['Electronic check' 'Mailed check' 'Bank transfer (automatic)' 'Credit card (automatic)'] Churn = ['No' 'Yes']
# Binary-encode the two-level categorical columns.
# Assign the replaced Series back to the frame instead of calling
# Series.replace(..., inplace=True) on a selected column: the inplace form
# operates on a selection (chained-assignment hazard) and inplace replace
# is deprecated in recent pandas.
telcom_df['gender'] = telcom_df['gender'].replace({'Female':0,'Male':1})
bicols = ['Partner','Dependents','PhoneService','PaperlessBilling','Churn']
for c in bicols:
    telcom_df[c] = telcom_df[c].replace({'Yes':1,'No':0})
# Sanity check: unique values of every column after encoding.
for c in telcom_df.columns:
    print(f'{c} = {telcom_df[c].unique()}')
gender = [0 1] SeniorCitizen = [0 1] Partner = [1 0] Dependents = [0 1] tenure = [ 1 34 2 45 8 22 10 28 62 13 16 58 49 25 69 52 71 21 12 30 47 72 17 27 5 46 11 70 63 43 15 60 18 66 9 3 31 50 64 56 7 42 35 48 29 65 38 68 32 55 37 36 41 6 4 33 67 23 57 61 14 20 53 40 59 24 44 19 54 51 26 39] PhoneService = [0 1] MultipleLines = ['No phone service' 'No' 'Yes'] InternetService = ['DSL' 'Fiber optic' 'No'] OnlineSecurity = ['No' 'Yes' 'No internet service'] OnlineBackup = ['Yes' 'No' 'No internet service'] DeviceProtection = ['No' 'Yes' 'No internet service'] TechSupport = ['No' 'Yes' 'No internet service'] StreamingTV = ['No' 'Yes' 'No internet service'] StreamingMovies = ['No' 'Yes' 'No internet service'] Contract = ['Month-to-month' 'One year' 'Two year'] PaperlessBilling = [1 0] PaymentMethod = ['Electronic check' 'Mailed check' 'Bank transfer (automatic)' 'Credit card (automatic)'] MonthlyCharges = [29.85 56.95 53.85 ... 63.1 44.2 78.7 ] TotalCharges = [ 29.85 1889.5 108.15 ... 346.45 306.6 6844.5 ] Churn = [0 1]
print_unique_cols(telcom_df)
MultipleLines = ['No phone service' 'No' 'Yes'] InternetService = ['DSL' 'Fiber optic' 'No'] OnlineSecurity = ['No' 'Yes' 'No internet service'] OnlineBackup = ['Yes' 'No' 'No internet service'] DeviceProtection = ['No' 'Yes' 'No internet service'] TechSupport = ['No' 'Yes' 'No internet service'] StreamingTV = ['No' 'Yes' 'No internet service'] StreamingMovies = ['No' 'Yes' 'No internet service'] Contract = ['Month-to-month' 'One year' 'Two year'] PaymentMethod = ['Electronic check' 'Mailed check' 'Bank transfer (automatic)' 'Credit card (automatic)']
# One-hot encode the remaining multi-level categorical features.
cat_col = ['MultipleLines','InternetService','OnlineSecurity','OnlineBackup',
           'DeviceProtection','TechSupport','StreamingTV','StreamingMovies',
           'Contract','PaymentMethod']
telcom_df = pd.get_dummies(data=telcom_df,columns=cat_col)
telcom_df.sample(3)
| gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | PaperlessBilling | MonthlyCharges | TotalCharges | Churn | ... | StreamingMovies_No | StreamingMovies_No internet service | StreamingMovies_Yes | Contract_Month-to-month | Contract_One year | Contract_Two year | PaymentMethod_Bank transfer (automatic) | PaymentMethod_Credit card (automatic) | PaymentMethod_Electronic check | PaymentMethod_Mailed check | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 789 | 1 | 0 | 0 | 0 | 72 | 1 | 0 | 69.65 | 4908.25 | 0 | ... | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 |
| 3684 | 1 | 0 | 0 | 0 | 43 | 1 | 1 | 75.35 | 3161.40 | 0 | ... | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 |
| 5488 | 1 | 1 | 1 | 0 | 45 | 1 | 1 | 98.70 | 4525.80 | 0 | ... | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 |
3 rows × 41 columns
telcom_df.nunique()
gender 2 SeniorCitizen 2 Partner 2 Dependents 2 tenure 72 PhoneService 2 PaperlessBilling 2 MonthlyCharges 1584 TotalCharges 6530 Churn 2 MultipleLines_No 2 MultipleLines_No phone service 2 MultipleLines_Yes 2 InternetService_DSL 2 InternetService_Fiber optic 2 InternetService_No 2 OnlineSecurity_No 2 OnlineSecurity_No internet service 2 OnlineSecurity_Yes 2 OnlineBackup_No 2 OnlineBackup_No internet service 2 OnlineBackup_Yes 2 DeviceProtection_No 2 DeviceProtection_No internet service 2 DeviceProtection_Yes 2 TechSupport_No 2 TechSupport_No internet service 2 TechSupport_Yes 2 StreamingTV_No 2 StreamingTV_No internet service 2 StreamingTV_Yes 2 StreamingMovies_No 2 StreamingMovies_No internet service 2 StreamingMovies_Yes 2 Contract_Month-to-month 2 Contract_One year 2 Contract_Two year 2 PaymentMethod_Bank transfer (automatic) 2 PaymentMethod_Credit card (automatic) 2 PaymentMethod_Electronic check 2 PaymentMethod_Mailed check 2 dtype: int64
# Scale the continuous features to [0, 1] so they are comparable with the
# 0/1 dummy columns.
# NOTE(review): the scaler is fit on the full dataset before the train/test
# split (L295) -- mild data leakage; strictly it should be fit on train only.
scale_col = ['tenure','MonthlyCharges','TotalCharges']
sc = MinMaxScaler()
telcom_df[scale_col]= sc.fit_transform(telcom_df[scale_col])
telcom_df.sample(3)
| gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | PaperlessBilling | MonthlyCharges | TotalCharges | Churn | ... | StreamingMovies_No | StreamingMovies_No internet service | StreamingMovies_Yes | Contract_Month-to-month | Contract_One year | Contract_Two year | PaymentMethod_Bank transfer (automatic) | PaymentMethod_Credit card (automatic) | PaymentMethod_Electronic check | PaymentMethod_Mailed check | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 4726 | 0 | 0 | 1 | 0 | 1.000000 | 0 | 1 | 0.307960 | 0.411049 | 0 | ... | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 |
| 4374 | 1 | 0 | 1 | 1 | 0.985915 | 1 | 1 | 0.013433 | 0.157933 | 0 | ... | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 |
| 3108 | 0 | 0 | 0 | 1 | 0.591549 | 1 | 1 | 0.873134 | 0.520829 | 0 | ... | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 |
3 rows × 41 columns
# Assign input features to X and output to y.
X = telcom_df.drop(columns=['Churn'])
y = telcom_df['Churn']
# Split dataset into training and test sets (stratified so both splits keep
# the same churn ratio; random_state fixed for reproducibility).
train_X,test_X,train_y,test_y = train_test_split(X,y,test_size=0.2,random_state=1,stratify=y)
print(f'train = {train_X.shape}')
print(f'test = {test_X.shape}')
from keras import regularizers
from keras.optimizers import SGD
def test_eval(clf_model, testX, testy):
    """Evaluate a fitted binary classifier on a test set.

    Prints the confusion matrix, classification report and AUC-ROC, and
    returns (cm, cr, score, y_predict) where y_predict holds the 0/1 labels
    obtained by thresholding the predicted probabilities at 0.5.
    """
    # Test set prediction (array of probabilities from the sigmoid output).
    y_pred=clf_model.predict(testX)
    # Threshold at 0.5 -- comprehension replaces the manual append loop.
    y_predict = [1 if p > 0.5 else 0 for p in y_pred]
    print('Confusion Matrix')
    print('='*60)
    cm = confusion_matrix(testy,y_predict)
    print(cm,"\n")
    print('Classification Report')
    print('='*60)
    cr = classification_report(testy,y_predict)
    print(cr,"\n")
    print('AUC-ROC')
    print('='*60)
    # AUC uses the raw probabilities, not the thresholded labels.
    score = roc_auc_score(testy, y_pred)
    print(score)
    return cm,cr,score,y_predict
def plot_cmatrix(cmatrix, name):
    """Render a confusion matrix as an annotated plotly heatmap."""
    colourscale = [[0.0, 'peachpuff'], [0.5, 'lime'], [1.0, 'green']]
    fig = px.imshow(cmatrix, color_continuous_scale=colourscale)
    fig.update_layout(
        title='Confusion Matrix (' + name + ')', title_x=0.5,
        xaxis_title="Prediction", yaxis_title="Truth",
        height=400,
        width=400,
        xaxis=dict(tickmode='linear', tick0=0, dtick=1),
        yaxis=dict(tickmode='linear', tick0=0, dtick=1),
    )
    # Overlay each cell's count as a text annotation.
    for row, cells in enumerate(cmatrix):
        for col in range(len(cells)):
            fig.add_annotation(
                x=col,
                y=row,
                text=str(cmatrix[row, col]),
                showarrow=False,
                font_size=12, font_color='black',
            )
    fig.show()
def plot_metrics(history):
    """Plot training vs validation loss and accuracy over the fitted epochs."""
    plt.figure(figsize=(12, 3))
    for position, metric in enumerate(('loss', 'accuracy'), start=1):
        plt.subplot(1, 2, position)
        plt.plot(history.epoch, history.history[metric], color='red', label='Train')
        plt.plot(history.epoch, history.history['val_' + metric],
                 color='green', linestyle="--", label='Val')
        plt.xlabel('Epoch')
        plt.ylabel(metric.replace("_", " ").capitalize())
        plt.legend()
# SGD with Nesterov momentum; small decay shrinks the learning rate over updates.
sgd = SGD(learning_rate=0.0081, decay=1e-6, momentum=0.8, nesterov=True)
# Stop training once validation accuracy has not improved for 12 epochs,
# rolling back to the best weights seen.
es = EarlyStopping(monitor='val_accuracy',
                   mode='max',
                   patience=12,
                   restore_best_weights=True
                   )
nepochs = 1000  # upper bound; EarlyStopping normally triggers much sooner
nbsize = 35     # mini-batch size
def build_model(opt, n_features=None):
    """Build and compile a small fully-connected binary classifier (40-10-1).

    Parameters
    ----------
    opt : keras optimizer used to compile the model.
    n_features : int, optional
        Number of input features. Defaults to train_X.shape[1] for backward
        compatibility; passing it explicitly removes the hidden dependency
        on the global train_X.

    Returns the compiled Sequential model with a sigmoid output.
    """
    if n_features is None:
        n_features = train_X.shape[1]
    model_NN1 = Sequential()
    # L2 regularization on both hidden layers to damp overfitting.
    model_NN1.add(Dense(40, input_shape=(n_features,), activation='relu', kernel_initializer='uniform',
                        kernel_regularizer=regularizers.l2(0.005)))
    model_NN1.add(Dense(10, activation='relu', kernel_initializer='uniform',
                        kernel_regularizer=regularizers.l2(0.005)))
    model_NN1.add(Dense(1, kernel_initializer='uniform', activation='sigmoid'))
    # summary() prints the table itself and returns None; wrapping it in
    # print() was emitting a stray "None" line after the summary.
    model_NN1.summary()
    model_NN1.compile(optimizer=opt, metrics=['accuracy'], loss='binary_crossentropy')
    return model_NN1
model_NN1 = build_model(sgd)
Model: "sequential" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= dense (Dense) (None, 40) 1640 _________________________________________________________________ dense_1 (Dense) (None, 10) 410 _________________________________________________________________ dense_2 (Dense) (None, 1) 11 ================================================================= Total params: 2,061 Trainable params: 2,061 Non-trainable params: 0 _________________________________________________________________ None
# Train with 20% of the training data held out for validation; EarlyStopping
# watches val_accuracy and restores the best weights.
history = model_NN1.fit(train_X,train_y,
                        callbacks = [es],
                        validation_split=0.2,
                        epochs=nepochs,
                        batch_size= nbsize,
                        verbose=0)
plot_metrics(history)
#history_dict = history.history
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
# the average and HIGHEST *validation* accuracy (val_acc, not train accuracy)
print(np.mean(val_acc),np.max(val_acc))
0.7873250985587085 0.8124444484710693
model_NN1.evaluate(test_X,test_y)
44/44 [==============================] - 0s 1ms/step - loss: 0.4499 - accuracy: 0.7939
[0.4499203860759735, 0.7938876748085022]
The result shows that the accuracy score is pretty high, but the recall score is slightly lower. Hence, performance of the model to predict the minority class (customers who Churn) is not good enough.
# Evaluate the baseline NN on the test set; keep the ROC and PR curve points
# for the comparison plots at the end.
cm,cr,auc_score,_ = test_eval(model_NN1,test_X,test_y)
fpr, tpr, thresh = roc_curve(test_y, model_NN1.predict(test_X), pos_label = 1)
precision, recall, _ = precision_recall_curve(test_y, model_NN1.predict(test_X))
Confusion Matrix
============================================================
[[925 108]
[182 192]]
Classification Report
============================================================
precision recall f1-score support
0 0.84 0.90 0.86 1033
1 0.64 0.51 0.57 374
accuracy 0.79 1407
macro avg 0.74 0.70 0.72 1407
weighted avg 0.78 0.79 0.79 1407
AUC-ROC
============================================================
0.8255483483545667
plot_cmatrix(cm, name='NN')
model_NN1 = build_model(sgd)
Model: "sequential_1" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= dense_3 (Dense) (None, 40) 1640 _________________________________________________________________ dense_4 (Dense) (None, 10) 410 _________________________________________________________________ dense_5 (Dense) (None, 1) 11 ================================================================= Total params: 2,061 Trainable params: 2,061 Non-trainable params: 0 _________________________________________________________________ None
# neural network with class weight for imbalanced data:
# a churner (class 1) counts 3x as much as a non-churner in the loss.
cw={0: 1, 1: 3}
history1 = model_NN1.fit(train_X,train_y,
                         callbacks = [es],
                         validation_split=0.2,
                         class_weight=cw,
                         epochs=nepochs,
                         batch_size=nbsize,
                         verbose=0)
WARNING:tensorflow:From /Users/KL/opt/anaconda3/envs/mypy38/lib/python3.8/site-packages/tensorflow/python/ops/array_ops.py:5043: calling gather (from tensorflow.python.ops.array_ops) with validate_indices is deprecated and will be removed in a future version. Instructions for updating: The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.
plot_metrics(history1)
acc1 = history1.history['accuracy']
val_acc1 = history1.history['val_accuracy']
# average and HIGHEST *validation* accuracy (val_acc1, not train accuracy)
print(np.mean(val_acc1), np.max(val_acc1))
0.6520592530568441 0.758222222328186
model_NN1.evaluate(test_X,test_y)
44/44 [==============================] - 0s 2ms/step - loss: 0.5028 - accuracy: 0.7498
[0.5027533173561096, 0.7498223185539246]
# Evaluate the class-weighted model on the test set; keep ROC / PR curve points.
cm1,cr1,auc_score1,_ = test_eval(model_NN1,test_X,test_y)
fpr1, tpr1, _ = roc_curve(test_y, model_NN1.predict(test_X), pos_label = 1)
precision1, recall1, _ = precision_recall_curve(test_y, model_NN1.predict(test_X))
Confusion Matrix
============================================================
[[784 249]
[103 271]]
Classification Report
============================================================
precision recall f1-score support
0 0.88 0.76 0.82 1033
1 0.52 0.72 0.61 374
accuracy 0.75 1407
macro avg 0.70 0.74 0.71 1407
weighted avg 0.79 0.75 0.76 1407
AUC-ROC
============================================================
0.8246527687903463
plot_cmatrix(cm1,name='NN with weighted class')
After over-sampling the data, we now see that there is an equal number of data for customers who churned and who did not.
from imblearn.over_sampling import SMOTE
#from imblearn.combine import SMOTEENN
from collections import Counter
# Class balance before resampling.
counter = Counter(train_y)
print('Before',counter)
# oversampling the train dataset using SMOTE (synthetic minority samples);
# applied to the training split only, so the test set stays untouched.
sm = SMOTE(random_state=1)
train_X_smote, train_y_smote = sm.fit_resample(train_X, train_y)
counter = Counter(train_y_smote)
print('After',counter)
Before Counter({0: 4130, 1: 1495})
After Counter({0: 4130, 1: 4130})
model_NN1 = build_model(sgd)
Model: "sequential_2" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= dense_6 (Dense) (None, 40) 1640 _________________________________________________________________ dense_7 (Dense) (None, 10) 410 _________________________________________________________________ dense_8 (Dense) (None, 1) 11 ================================================================= Total params: 2,061 Trainable params: 2,061 Non-trainable params: 0 _________________________________________________________________ None
# Train on the SMOTE-balanced training data.
# NOTE(review): SMOTE was applied before validation_split, so the 20%
# validation slice contains synthetic samples -- val_accuracy is optimistic.
history2 = model_NN1.fit(train_X_smote,train_y_smote,
                         callbacks = [es],
                         validation_split=0.2,
                         epochs=nepochs,
                         batch_size = nbsize,
                         verbose=0)
plot_metrics(history2)
#history_dict = history.history
acc2 = history2.history['accuracy']
val_acc2 = history2.history['val_accuracy']
# the average and HIGHEST *validation* accuracy
print(np.mean(val_acc2),np.max(val_acc2))
0.749046931875513 0.9194915294647217
# Evaluate the SMOTE-trained model on the (untouched, imbalanced) test set.
cm2,cr2,auc_score2,_ = test_eval(model_NN1,test_X,test_y)
fpr2, tpr2, _ = roc_curve(test_y, model_NN1.predict(test_X), pos_label = 1)
precision2, recall2, _ = precision_recall_curve(test_y, model_NN1.predict(test_X))
Confusion Matrix
============================================================
[[751 282]
[ 84 290]]
Classification Report
============================================================
precision recall f1-score support
0 0.90 0.73 0.80 1033
1 0.51 0.78 0.61 374
accuracy 0.74 1407
macro avg 0.70 0.75 0.71 1407
weighted avg 0.80 0.74 0.75 1407
AUC-ROC
============================================================
0.8324256746613105
np.mean(tpr2)
0.6166452486561925
plot_cmatrix(cm2,name='NN with oversampling')
test_y.value_counts()
0 1033 1 374 Name: Churn, dtype: int64
# NOTE(review): "Precision"/"Recall" below are the means over the points of
# each precision-recall CURVE (across all thresholds), not the precision and
# recall at the 0.5 operating point shown in the classification reports.
print(f'--- Precision ---')
print(f'Neural Network: {np.mean(precision)}')
print(f'Neural Network with weighted class: {np.mean(precision1)}')
print(f'Neural Network with oversampling technique: {np.mean(precision2)}')
print(f'--- Recall ---')
print(f'Neural Network: {np.mean(recall)}')
print(f'Neural Network with weighted class: {np.mean(recall1)}')
print(f'Neural Network with oversampling technique: {np.mean(recall2)}')
print(f'--- AUC Score ---')
print(f'Neural Network: {auc_score}')
print(f'Neural Network with weighted class: {auc_score1}')
print(f'Neural Network with oversampling technique: {auc_score2}')
--- Precision --- Neural Network: 0.4985446360837776 Neural Network with weighted class: 0.49795553030233397 Neural Network with oversampling technique: 0.5013334297061078 --- Recall --- Neural Network: 0.7308933304060095 Neural Network with weighted class: 0.7292126563649742 Neural Network with oversampling technique: 0.7363675540146128 --- AUC Score --- Neural Network: 0.8255483483545667 Neural Network with weighted class: 0.8246527687903463 Neural Network with oversampling technique: 0.8324256746613105
# ROC curves for the three models on the test set.
fig = go.Figure()
fig.add_trace(go.Scatter(x=fpr,y=tpr,mode='lines',name='NN'))
fig.add_trace(go.Scatter(x=fpr1,y=tpr1,mode='lines',name='NN with weighted class'))
fig.add_trace(go.Scatter(x=fpr2,y=tpr2,mode='lines',name='NN with oversampling'))
# Axis labels fixed: FPR = 1 - specificity and TPR = sensitivity;
# the original labels had the two parentheticals swapped.
fig.update_layout(title='ROC',
                  xaxis_title='False Positive Rate (1 - Specificity)',
                  yaxis_title='True Positive Rate (Sensitivity)', title_x = 0.5,
                  width = 900, height=500,
                  )
fig.show()
# Precision-recall curves for the three models on the test set.
fig = go.Figure()
fig.add_trace(go.Scatter(x=recall,y=precision,mode='lines',name='NN'))
fig.add_trace(go.Scatter(x=recall1,y=precision1,mode='lines',name='NN with weighted class'))
fig.add_trace(go.Scatter(x=recall2,y=precision2,mode='lines',name='NN with oversampling'))
fig.update_layout(title='Precision-Recall Curve',
                  xaxis_title='Recall',
                  yaxis_title='Precision', title_x = 0.5,
                  width = 900, height=450,
                  )
fig.show()
When using SMOTE to balance the data class, result shows that the recall score is increased, although the precision score is slightly decreased. This means that the model performance to correctly predict the minority class label is getting better by using SMOTE method to balance the data.
EDA results show that customers with low tenure (first 6 months) had higher churn rate and the churn rate decreases as the tenure increases.
In the encoding section, One hot encoding is used to transform categorical data and MinMaxScaler is used to scale numerical data. Thus, the performance results could be somewhat different if the label encoder and another scaling method were used instead.
This Telco Churn Prediction dataset is a class-imbalance classification problem. There are significantly fewer customers who churned than those who did not. Since the column 'Churn' is the target variable in this project, this causes an issue for a fully connected Neural Network (NN) -- the model will be biased towards predicting the majority class (No Churn) because there is not enough data to learn the patterns present in the minority class (Churn). Hence, there will be high misclassification errors for the minority class.
To fix this issue we use two approaches: class weighting and SMOTE oversampling.
Performance metrics of these different approaches are compared and evaluated against the baseline NN. Results show that we are able to predict better which customers are likely to churn using the fully connected neural network with the SMOTE upsampling method.